Youtube comment analysis using PrimeText


In [28]:
%matplotlib


Using matplotlib backend: MacOSX

In [2]:
import sys
import pandas as pa
import numpy as np
from primetext import primetext
import matplotlib.pyplot as plt

from nltk.stem.lancaster import LancasterStemmer
from autocorrect import spell
st = LancasterStemmer()

In [3]:
pt = primetext()
ytData = pa.read_csv("utubelabled.csv",encoding ='ISO-8859-1')
comments = ytData['comment']

In [4]:
comments = comments.str.replace('','')

In [5]:
def cleanData(records,labels):
    output = []
    outputLabels = []

    recordsChecked = 0
    recordsToCheck = len(records)
    for index,sentence in enumerate(records):
        recordsChecked += 1
        sys.stdout.write("\rRecords cleaned : %i / %i" % (recordsChecked,recordsToCheck))
        cleanSentence = ''
        if len(sentence) < 200:
            words = sentence.split(' ')
            for word in words:
                if len(word) < 12:
                    if word.isalpha():
                        cleanSentence += st.stem(spell(word.lower())) + ' '
        if cleanSentence:
            output.append(cleanSentence.strip())  
            outputLabels.append(labels[index])
    sys.stdout.write("\n")
    sys.stdout.flush()
    return output,outputLabels

In [6]:
[cleanedRecords, cleanedLabels] = cleanData(comments,ytData['troll'])


Records cleaned : 3936 / 3936

In [7]:
pt.index(cleanedRecords)


Records checked : 3468
Indexed dictionary
Indexed comments

In [8]:
keyText = []
keyCount = []
for key, value in pt.indexedDictionary.items():
    c = pt.countInRecords([key])
    keyText.append(key)
    keyCount.append(c)
    
s1 = pa.Series(keyCount,index=keyText)

sortedS1  = s1.sort_values(ascending= False)[:50]

sortedS1.plot.bar()


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x114b08550>

In [9]:
df = pa.DataFrame(index=sortedS1.index, columns=sortedS1.index)
df = df.fillna(0)

In [10]:
names = sortedS1.index
colsdone = 0
for col in names:
    colsdone += 1
    sys.stdout.write("\rCols done : %i" % colsdone)
    for row in names:
        df[col][row] = pt.countInRecords([col,row])
sys.stdout.write("\n")
sys.stdout.flush()


Cols done : 50

In [11]:
imgplot = plt.imshow(df,interpolation="nearest")
plt.xticks( range(len(names)), names, rotation=90 )
plt.yticks( range(len(names)), names, rotation=0 )
plt.colorbar()
plt.show()



In [12]:
myLabels = pa.Series(cleanedLabels)
myLabels.sum()


Out[12]:
346.0

In [13]:
totalComments = myLabels.count()
totalTrollComments = myLabels.sum()
trollWeight = (totalComments-totalTrollComments)/totalComments
nonTrollWeight = totalTrollComments/totalComments

trollWeight, nonTrollWeight


Out[13]:
(0.90020190366310937, 0.099798096336890685)

In [14]:
# for each troll comment add the troll weight to each word
# for each non troll comment minus the nonTrollWeight from each word

In [15]:
len(pt.cleanedDictionary)


Out[15]:
3153

In [16]:
totalFoundTrolling = myLabels[pt.find(['the'])].sum()
totalFoundTrolling


Out[16]:
61.0

In [17]:
totalFoundNotTrolling = pt.find(['the']).sum() - totalFoundTrolling
totalFoundNotTrolling


Out[17]:
895.0

In [18]:
trollScore = (totalFoundTrolling * trollWeight) - (totalFoundNotTrolling * nonTrollWeight)
trollScore


Out[18]:
-34.406980098067493

In [19]:
trollScores = []
for word in pt.cleanedDictionary:
    totalFoundTrolling = myLabels[pt.find([word])].sum()
    totalFoundNotTrolling = pt.find([word]).sum() - totalFoundTrolling
    trollScore = (totalFoundTrolling * trollWeight) - (totalFoundNotTrolling * nonTrollWeight)
    trollScores.append(trollScore)

In [20]:
s2 = pa.Series(trollScores,index= pt.cleanedDictionary)

sortedPos  = s2.sort_values(ascending= True)[:100]

sortedPos.plot.bar()
print(sortedPos)


the       -34.406980
i         -22.377848
to        -17.389963
it        -13.837612
lov       -12.366888
for       -12.248918
but       -12.069513
and       -11.206519
not        -9.466686
was        -7.753966
real       -7.376983
al         -6.566484
cut        -6.287280
so         -5.945486
know       -5.884050
on         -5.733776
happy      -5.688491
good       -5.684453
get        -5.572541
my         -5.562446
jam        -5.387078
when       -5.277185
do         -4.786271
he         -4.743871
best       -4.690511
in         -4.328526
pap        -4.191520
wil        -4.083646
me         -3.880012
we         -3.782232
             ...    
wher       -2.393135
very       -2.393135
nee        -2.389097
pretty     -2.295356
wait       -2.193539
thing      -2.193539
com        -2.187482
arian      -2.095760
break      -2.095760
dog        -2.095760
find       -2.095760
keep       -2.093741
org        -2.093741
that       -2.031151
our        -1.995962
work       -1.995962
trudeau    -1.995962
respect    -1.995962
adam       -1.995962
sing       -1.995962
someon     -1.993943
from       -1.979810
his        -1.973753
last       -1.896164
must       -1.896164
xd         -1.896164
feel       -1.894145
nic        -1.894145
been       -1.892126
think      -1.886069
dtype: float64

In [29]:
s2 = pa.Series(trollScores,index= pt.cleanedDictionary)

sortedNeg  = s2.sort_values(ascending= False)[:100]

sortedNeg.plot.bar()


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x114aa79e8>

In [ ]:


In [22]:
def calTrollScore(comment):
    words = str(comment).split(' ')
    score = 0
    for word in words:
        if word in s2:
            score += s2[word]
    return score

In [23]:
for i in range(10):
    print(cleanedRecords[i],calTrollScore(cleanedRecords[i]))


who els think act a gre sing 12.405537929
what a lucky got celebr birthday on concert with bil of cold fan 8.98240553793
lov it -26.2044995673
no am ev knew who cord was sev year ago -7.27199307759
my birthday was the as wel and we both support west ham -70.7755985001
org im cry -3.28526103259
i do feel cold ar extrem over howev thi was would lov to see mor stuff lik thi in the -87.7213729449
whoev post process that suck al the spirit of the perform but hey sound gre on yo -96.7923276608
i do lov u we r wait for u her in kiss -46.8543409288
the best duo i lov thi -67.9186616671

In [24]:
def predictTroll(comment,theta):
    return calTrollScore(comment) > theta[0]

def costTrollPredict(theta):
    result = list(map(lambda c:predictTroll(c,theta),cleanedRecords))

In [ ]:


In [25]:
pred = costTrollPredict([30])
trueVal = list(map(lambda v: v==1.0,cleanedLabels))

In [26]:
from sklearn.metrics import f1_score

In [27]:
f1_score(trueVal, pred)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-27-ae7df214b052> in <module>()
----> 1 f1_score(trueVal, pred)

/Users/zackakil/anaconda/envs/dev/lib/python3.5/site-packages/sklearn/metrics/classification.py in f1_score(y_true, y_pred, labels, pos_label, average, sample_weight)
    637     return fbeta_score(y_true, y_pred, 1, labels=labels,
    638                        pos_label=pos_label, average=average,
--> 639                        sample_weight=sample_weight)
    640 
    641 

/Users/zackakil/anaconda/envs/dev/lib/python3.5/site-packages/sklearn/metrics/classification.py in fbeta_score(y_true, y_pred, beta, labels, pos_label, average, sample_weight)
    754                                                  average=average,
    755                                                  warn_for=('f-score',),
--> 756                                                  sample_weight=sample_weight)
    757     return f
    758 

/Users/zackakil/anaconda/envs/dev/lib/python3.5/site-packages/sklearn/metrics/classification.py in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)
    954         raise ValueError("beta should be >0 in the F-beta score")
    955 
--> 956     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    957     present_labels = unique_labels(y_true, y_pred)
    958 

/Users/zackakil/anaconda/envs/dev/lib/python3.5/site-packages/sklearn/metrics/classification.py in _check_targets(y_true, y_pred)
     72     check_consistent_length(y_true, y_pred)
     73     type_true = type_of_target(y_true)
---> 74     type_pred = type_of_target(y_pred)
     75 
     76     y_type = set([type_true, type_pred])

/Users/zackakil/anaconda/envs/dev/lib/python3.5/site-packages/sklearn/utils/multiclass.py in type_of_target(y)
    234     if not valid:
    235         raise ValueError('Expected array-like (array or non-string sequence), '
--> 236                          'got %r' % y)
    237 
    238     if is_multilabel(y):

ValueError: Expected array-like (array or non-string sequence), got None

In [ ]:
vals = []
for i in range(-30,70):
    predt = costTrollPredict([i])
    cost = f1_score(trueVal, predt)
    vals.append(cost)
plt.plot(vals)

In [ ]:
np.asarray(costTrollPredict([50])).sum()

In [ ]:
output = np.asarray(trueVal)

In [ ]:
pred = np.asarray(costTrollPredict([0]))

In [ ]:
output.sum()

In [ ]:
len(pred)

In [ ]:
output[pred].sum()

In [ ]:
pred

In [ ]:
def calculateEffect(predFunc):
    plotLog = []
    plotFalse = []
    plotx = []
    for i in range(-100,100,5):
        pred = np.asarray(predFunc([i]))
        trollsFound = output[pred].sum()
        falsePos = (output == False)[pred].sum()
        falsePosPc =  ((100/(output==False).sum())*falsePos)  
        plotLog.append((100/output.sum())*trollsFound)
        plotFalse.append(falsePosPc)
        plotx.append(i)
    return plotLog,plotFalse,plotx

[plotLog,plotFalse,plotx] = calculateEffect(costTrollPredict)

In [ ]:
plotDiff = list(map(lambda a,b: a-b,plotLog,plotFalse ))
plt.title('Plot of % true positives against false positives (Trolls caught)')
plt.plot(plotx,plotLog,c='g')
plt.plot(plotx,plotFalse,c='r')
plt.plot(plotx,plotDiff,c='b')
plt.axvline(0,linestyle = 'dashed')

Lower solution complexity for product implimentation

only calculate based on the top polarising negative words


In [ ]:
usedNeg = sortedNeg[:20]
usedPos = sortedPos[:20]
def calTrollScoreSim(comment):
    words = str(comment).split(' ')
    score = 0
    for word in words:
        if word in usedNeg:
            score += usedNeg[word]
        elif word in usedPos:
            score += usedPos[word]
    return score

In [ ]:
def predictTroll2(comment,theta):
    return calTrollScoreSim(comment) > theta[0]

def costTrollPredict2(theta):
    result = list(map(lambda c:predictTroll2(c,theta),cleanedRecords))
    return result

In [ ]:
costTrollPredict2([0])

In [ ]:
vals = []
for i in range(0,140,5):
    predt = costTrollPredict2([i])
    cost = f1_score(trueVal, predt)
    vals.append(cost)
plt.plot(vals)

In [ ]:
[plotLog2,plotFalse2,plotx2] = calculateEffect(costTrollPredict2)

In [ ]:
plotDiff2 = list(map(lambda a,b: a-b,plotLog2,plotFalse2 ))
plt.title('Plot of % true positives against false positives (Trolls caught) using top 100 polarizing')
plt.plot(plotx2,plotLog2,c='g')
plt.plot(plotx2,plotFalse2,c='r')
plt.plot(plotx2,plotDiff2,c='b')
plt.axvline(0,linestyle = 'dashed')

In [ ]:
def calculateEffectAt0(predFunc):
    global usedNeg
    global usedPos
    plotLog = []
    plotFalse = []
    plotx = []
    for i in range(1,200,5):
        usedNeg = sortedNeg[:i]
        usedPos = sortedPos[:i]
        pred = np.asarray(predFunc([0]))
        trollsFound = output[pred].sum()
        falsePos = (output == False)[pred].sum()
        falsePosPc =  ((100/(output==False).sum())*falsePos)  
        plotLog.append((100/output.sum())*trollsFound)
        plotFalse.append(falsePosPc)
        plotx.append(i)
    return plotLog,plotFalse,plotx

In [ ]:
[plotLog3,plotFalse3,plotx3] = calculateEffectAt0(costTrollPredict2)

In [ ]:
plotDiff3 = list(map(lambda a,b: a-b,plotLog3,plotFalse3 ))
plt.title('Plot of % true positives against false positives (Trolls caught) using top x polarizing')
plt.plot(plotx3,plotLog3,c='g')
plt.plot(plotx3,plotFalse3,c='r')
plt.plot(plotx3,plotDiff3,c='b')
plt.axvline(0,linestyle = 'dashed')

In [ ]: